import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.linear_model import LinearRegression


pd.options.display.float_format = '{:,.2f}'.format

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()


data = pd.read_csv('cost_revenue_dirty.csv')


data.head()


data.shape

(5391, 6)


data.isna().values.any()

False


data.duplicated().values.any()

False


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5391 entries, 0 to 5390
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Rank                   5391 non-null   int64 
 1   Release_Date           5391 non-null   object
 2   Movie_Title            5391 non-null   object
 3   USD_Production_Budget  5391 non-null   object
 4   USD_Worldwide_Gross    5391 non-null   object
 5   USD_Domestic_Gross     5391 non-null   object
dtypes: int64(1), object(5)
memory usage: 252.8+ KB


# data.USD_Production_Budget = data.USD_Production_Budget.str.replace('$', '').str.replace(',', '')
# data.USD_Production_Budget = pd.to_numeric(data.USD_Production_Budget)

# data.USD_Worldwide_Gross = data.USD_Worldwide_Gross.str.replace('$', '').str.replace(',', '')
# data.USD_Worldwide_Gross = pd.to_numeric(data.USD_Worldwide_Gross)

# data.USD_Domestic_Gross = data.USD_Domestic_Gross.str.replace('$', '').str.replace(',', '')
# data.USD_Domestic_Gross = pd.to_numeric(data.USD_Domestic_Gross)

# data.info()


chars_to_remove = [',', '$']
columns_to_clean = ['USD_Production_Budget', 
                    'USD_Worldwide_Gross',
                    'USD_Domestic_Gross']
 
for col in columns_to_clean:
    for char in chars_to_remove:
        # Replace each character with an empty string
        data[col] = data[col].astype(str).str.replace(char, "")
    # Convert column to a numeric data type
    data[col] = pd.to_numeric(data[col])

/var/folders/mj/365hbdq166n0n8b1hxp26g9h0000gn/T/ipykernel_99464/3675212536.py:9: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  data[col] = data[col].astype(str).str.replace(char, "")


data.Release_Date = pd.to_datetime(data.Release_Date)

data.Release_Date.head()

0   1915-08-02
1   1916-05-09
2   1916-12-24
3   1920-09-17
4   1925-01-01
Name: Release_Date, dtype: datetime64[ns]


data.describe()


# Highest Production Budget Film
data[data.USD_Production_Budget == 425_000_000]


# Highest Worldwide Gross Revenue
data[data.USD_Worldwide_Gross == 2_783_918_982]


# Revenue of Lowest Budget Film
data[data.USD_Production_Budget == 1100]


data[data.USD_Domestic_Gross == 0].count()

Rank                     512
Release_Date             512
Movie_Title              512
USD_Production_Budget    512
USD_Worldwide_Gross      512
USD_Domestic_Gross       512
dtype: int64


data[data.USD_Domestic_Gross == 0].sort_values("USD_Production_Budget", ascending=False).head(10)


data[data.USD_Worldwide_Gross == 0].count()

Rank                     357
Release_Date             357
Movie_Title              357
USD_Production_Budget    357
USD_Worldwide_Gross      357
USD_Domestic_Gross       357
dtype: int64


data[data.USD_Worldwide_Gross == 0].sort_values("USD_Production_Budget", ascending=False).head(10)


international_releases = data.loc[(data.USD_Domestic_Gross == 0) & (data.USD_Worldwide_Gross != 0)]  # &: Bitwise And
print(f"Number: {len(international_releases)}")
international_releases.head()

Number: 155


data.query("USD_Domestic_Gross == 0 and USD_Worldwide_Gross != 0").head()


# Date of Data Collection
scrape_date = pd.Timestamp('2018-5-1')


# data_clean = data[data.Release_Date < scrape_date]
future_releases = data.loc[data.Release_Date >= scrape_date]
print(f"Number of Unreleased Films: {len(future_releases)}")
data_clean = data.drop(future_releases.index)
data_clean.head()

Number of Unreleased Films: 7


losing_money = data_clean.query("USD_Production_Budget > USD_Worldwide_Gross")
(len(losing_money)/len(data_clean)) * 100

37.27711738484398


plt.figure(figsize=(8, 4), dpi=200)

sns.scatterplot(data=data_clean,
                x='USD_Production_Budget', 
                y='USD_Worldwide_Gross')

<AxesSubplot:xlabel='USD_Production_Budget', ylabel='USD_Worldwide_Gross'>


plt.figure(figsize=(8, 4), dpi=200)

ax = sns.scatterplot(data=data_clean,
                x='USD_Production_Budget', 
                y='USD_Worldwide_Gross')

ax.set(ylabel="Revenue in $ Billion",
      xlabel="Budget in $100 Million")

plt.show()


plt.figure(figsize=(8,4), dpi=200)
 
# set styling on a single chart
with sns.axes_style('darkgrid'):  # Axis colours
  ax = sns.scatterplot(data=data_clean,
                       x='USD_Production_Budget', 
                       y='USD_Worldwide_Gross',
                       hue='USD_Worldwide_Gross', # Colour of Dots
                       size='USD_Worldwide_Gross') # Size of Dots
 
  ax.set(ylim=(0, 3000000000),
        xlim=(0, 450000000),
        ylabel='Revenue in $ billions',
        xlabel='Budget in $100 millions')


plt.figure(figsize=(8,4), dpi=200)
 
# set styling on a single chart
with sns.axes_style('whitegrid'):  # Axis colours
  ax = sns.scatterplot(data=data_clean,
                       x='Release_Date', 
                       y='USD_Production_Budget',
                       hue='USD_Worldwide_Gross', # Colour of Dots
                       size='USD_Worldwide_Gross') # Size of Dots
 
  ax.set(xlim=(data_clean.Release_Date.min(), data_clean.Release_Date.max()),        
        ylabel='Budget in $100 millions',
        xlabel='Year')


dt_index = pd.DatetimeIndex(data_clean.Release_Date)
years = dt_index.year


# Converting 1999 to 1990 Decade: 1999//10 = 199*10 = 1990
decades = (years // 10) * 10
data_clean["Decade"] = decades
data_clean.head()


# data_clean.drop("Decades", axis=1, inplace=True)
# data_clean.head()


old_films = data_clean[data_clean.Decade < 1970]
new_films = data_clean[data_clean.Decade >= 1970]
print(f"Num films before 1970: {len(old_films)}")
print(f"Most Expensive Film before 1970: {old_films.loc[old_films.USD_Production_Budget == old_films.USD_Production_Budget.max()]}")

Num films before 1970: 153
Most Expensive Film before 1970:      Rank Release_Date Movie_Title  USD_Production_Budget  \
109  1253   1963-12-06   Cleopatra               42000000   

     USD_Worldwide_Gross  USD_Domestic_Gross  Decade  
109             71000000            57000000    1960


plt.figure(figsize=(8,4), dpi=200)
with sns.axes_style("whitegrid"):
  sns.regplot(data=old_films, 
            x='USD_Production_Budget', 
            y='USD_Worldwide_Gross',
            scatter_kws = {'alpha': 0.4},
            line_kws = {'color': 'black'})


plt.figure(figsize=(8,4), dpi=200)
with sns.axes_style('darkgrid'):
  ax = sns.regplot(data=new_films,
                   x='USD_Production_Budget',
                   y='USD_Worldwide_Gross',
                   color='#2f4b7c',
                   scatter_kws = {'alpha': 0.3},
                   line_kws = {'color': '#ff7c43'})
  
  ax.set(ylim=(0, 3000000000),
         xlim=(0, 450000000),
         ylabel='Revenue in $ billions',
         xlabel='Budget in $100 millions')


regression = LinearRegression()

# Explanatory Variable(s) or Feature(s)
X = pd.DataFrame(new_films, columns=['USD_Production_Budget'])
 
# Response Variable or Target
y = pd.DataFrame(new_films, columns=['USD_Worldwide_Gross'])


regression.fit(X, y)

LinearRegression()


# Theta 0: y-intercept
regression.intercept_

array([-8650768.00661024])


# Theta 1: Slope of the regression line
regression.coef_

array([[3.12259592]])


# R-squared
regression.score(X, y)

0.5577032617720403


regression = LinearRegression()

# Explanatory Variable(s) or Feature(s)
X = pd.DataFrame(old_films, columns=['USD_Production_Budget'])
 
# Response Variable or Target
y = pd.DataFrame(old_films, columns=['USD_Worldwide_Gross']) 

regression.fit(X, y)

LinearRegression()


# R-squared
regression.score(X, y)

0.02937258620576877


22821538 + 1.64771314 * 350000000

599521137.0


budget = 350000000
revenue_estimate = regression.intercept_[0] + regression.coef_[0,0]*budget
revenue_estimate = round(revenue_estimate, -6)
print(f'The estimated revenue for a $350 film is around ${revenue_estimate:.10}.')

The estimated revenue for a $350 film is around $600000000.0.

	Rank	Release_Date	Movie_Title	USD_Production_Budget	USD_Worldwide_Gross	USD_Domestic_Gross
0	5293	8/2/1915	The Birth of a Nation	$110,000	$11,000,000	$10,000,000
1	5140	5/9/1916	Intolerance	$385,907	$0	$0
2	5230	12/24/1916	20,000 Leagues Under the Sea	$200,000	$8,000,000	$8,000,000
3	5299	9/17/1920	Over the Hill to the Poorhouse	$100,000	$3,000,000	$3,000,000
4	5222	1/1/1925	The Big Parade	$245,000	$22,000,000	$11,000,000

	Rank	USD_Production_Budget	USD_Worldwide_Gross	USD_Domestic_Gross
count	5,391.00	5,391.00	5,391.00	5,391.00
mean	2,696.00	31,113,737.58	88,855,421.96	41,235,519.44
std	1,556.39	40,523,796.88	168,457,757.00	66,029,346.27
min	1.00	1,100.00	0.00	0.00
25%	1,348.50	5,000,000.00	3,865,206.00	1,330,901.50
50%	2,696.00	17,000,000.00	27,450,453.00	17,192,205.00
75%	4,043.50	40,000,000.00	96,454,455.00	52,343,687.00
max	5,391.00	425,000,000.00	2,783,918,982.00	936,662,225.00

	Rank	Release_Date	Movie_Title	USD_Production_Budget	USD_Worldwide_Gross
5388	96	2020-12-31	Singularity	175000000	0
5387	126	2018-12-18	Aquaman	160000000	0
5384	321	2018-09-03	A Wrinkle in Time	103000000	0
5385	366	2018-10-08	Amusement Park	100000000	0
5090	556	2015-12-31	Don Gato, el inicio de la pandilla	80000000	4547660
4294	566	2012-12-31	Astérix et Obélix: Au service de Sa Majesté	77600000	60680125
5058	880	2015-11-12	The Ridiculous 6	60000000	0
5338	879	2017-04-08	The Dark Tower	60000000	0
5389	1119	2020-12-31	Hannibal the Conqueror	50000000	0
4295	1230	2012-12-31	Foodfight!	45000000	73706

	Rank	Release_Date	Movie_Title	USD_Production_Budget
5388	96	2020-12-31	Singularity	175000000
5387	126	2018-12-18	Aquaman	160000000
5384	321	2018-09-03	A Wrinkle in Time	103000000
5385	366	2018-10-08	Amusement Park	100000000
5058	880	2015-11-12	The Ridiculous 6	60000000
5338	879	2017-04-08	The Dark Tower	60000000
5389	1119	2020-12-31	Hannibal the Conqueror	50000000
5092	1435	2015-12-31	The Crow	40000000
3300	1631	2008-12-31	Black Water Transit	35000000
5045	1656	2015-10-30	Freaks of Nature	33000000

	Rank	Release_Date	Movie_Title	USD_Production_Budget	USD_Worldwide_Gross
71	4310	1956-02-16	Carousel	3380000	3220
1579	5087	2001-02-11	Everything Put Together	500000	7890
1744	3695	2001-12-31	The Hole	7500000	10834406
2155	4236	2003-12-31	Nothing	4000000	63180
2203	2513	2004-03-31	The Touch	20000000	5918742

Linear Regression & Seaborn: Introduction¶

Import Statements¶

Notebook Presentation¶

Read the Data¶

Explore and Clean the Data¶

Data Type Conversions¶

Descriptive Statistics¶

Investigating the Zero Revenue Films¶

Filtering on Multiple Conditions¶

Unreleased Films¶

Films that Lost Money¶

Seaborn for Data Viz: Bubble Charts¶

Plotting Movie Releases over Time¶

Converting Years to Decades Trick¶

Separate the "old" (before 1969) and "New" (1970s onwards) Films¶

Seaborn Regression Plots¶

Run Your Own Regression with scikit-learn¶

Use Your Model to Make a Prediction¶

	Rank	Release_Date	Movie_Title	USD_Production_Budget	USD_Worldwide_Gross	USD_Domestic_Gross
0	5293	1915-08-02	The Birth of a Nation	110000	11000000	10000000
1	5140	1916-05-09	Intolerance	385907	0	0
2	5230	1916-12-24	20,000 Leagues Under the Sea	200000	8000000	8000000
3	5299	1920-09-17	Over the Hill to the Poorhouse	100000	3000000	3000000
4	5222	1925-01-01	The Big Parade	245000	22000000	11000000